library(wordbankr)
# List the names exported by the attached wordbankr package.
ls("package:wordbankr")
## [1] "fit_aoa"                 "get_administration_data"
## [3] "get_crossling_data"      "get_crossling_items"
## [5] "get_instrument_data"     "get_instruments"
## [7] "get_item_data"           "get_sources"
## [9] "summarise_items"

# Fetch the available instruments, then print them.
# (Assignment and printing are separate statements: R cannot parse two
# expressions on one line without a separator.)
instruments <- get_instruments()
instruments

# Fetch the available sources, then print them.
sources <- get_sources()
sources

# Same query restricted to a single language.
get_sources(language = "English (American)")
# Administration data for the English (American) WS form, then print it.
# (Assignment and printing must be separate statements to parse.)
admins_eng_ws <- get_administration_data(language = "English (American)",
                                         form = "WS")
admins_eng_ws

# Number of distinct data_id values in that table.
n_distinct(admins_eng_ws$data_id)
## [1] 5520

# Number of rows at each age.
admins_eng_ws %>% count(age)

# Production against age: jittered raw points with a smoother on top.
ggplot(admins_eng_ws, aes(x = age, y = production)) +
  geom_jitter(colour = "grey", size = 0.5) +
  geom_smooth()
# Administration data filtered by language only.
# (Each assignment is followed by a separate statement that prints the result;
# the two cannot share a line without a separator.)
admins_russian <- get_administration_data(language = "Russian")
admins_russian

# Administration data filtered by form only.
admins_ws <- get_administration_data(form = "WS")
admins_ws

# Administration data with no filter at all.
admins <- get_administration_data()
admins

nrow(admins)
## [1] 82055

# Number of rows for every language/form combination.
admins %>% count(language, form)
# Item data for the English (American) WS form, then print it.
# (Assignment and printing must be separate statements to parse.)
items_eng_ws <- get_item_data(language = "English (American)", form = "WS")
items_eng_ws

# Distinct values of the three classification columns.
items_eng_ws %>% distinct(type)
items_eng_ws %>% distinct(category)
items_eng_ws %>% distinct(lexical_category)

# Item data with no filter, then print it.
items <- get_item_data()
items

# Number of items for every language/form combination.
items %>% count(language, form)
# Item rows whose definition is "dog" or "cat", and the ids of those items.
dog_cat <- filter(items_eng_ws, definition %in% c("dog", "cat"))
ids <- pull(filter(items_eng_ws, definition %in% c("dog", "cat")), item_id)

# Instrument data for just those items.
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = ids
)

# Same query with the administrations and iteminfo flags switched on.
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = ids,
  administrations = TRUE,
  iteminfo = TRUE
)

# Same query again, this time passing data frames for administrations
# (rows with age == 24) and iteminfo (the dog/cat item rows).
twos <- filter(admins_eng_ws, age == 24)
get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = ids,
  administrations = twos,
  iteminfo = dog_cat
)
Exercises
Compute and plot median productive vocabulary size (as proportion of total words) over age in each language. Limit to WS data for children 16-30 months old (hint: left_join and facet_wrap are likely to be helpful).
For English WS data, compute and plot the proportion of children that produce each word in the “toys” category at each age.
Median vocabulary sizes
# Number of items of type "word" on the WS form, per language.
num_words <- items %>%
  filter(form == "WS", type == "word") %>%
  group_by(language) %>%
  summarise(words = n())

# Median of production / words for WS rows with age between 16 and 30,
# per language and age.
vocab_summary <- admins %>%
  filter(form == "WS", age >= 16, age <= 30) %>%
  # Name the join key explicitly rather than relying on a natural join over
  # whatever columns the two tables happen to share.
  left_join(num_words, by = "language") %>%
  mutate(prop_vocab = production / words) %>%
  group_by(language, age) %>%
  summarise(median_vocab = median(prop_vocab))

# One panel per language; y axis fixed to [0, 1] because it is a proportion.
ggplot(vocab_summary, aes(x = age, y = median_vocab)) +
  facet_wrap(~language) +
  geom_point() +
  ylim(0, 1) +
  labs(x = "Age (months)", y = "Productive vocabulary size")
Toy trajectories
# Word items in the "toys" category.
toys <- filter(items_eng_ws, type == "word", category == "toys")

# Instrument data for those items, with administration and item columns
# attached from the supplied data frames.
toys_data <- get_instrument_data(
  language = "English (American)",
  form = "WS",
  items = toys$item_id,
  administrations = admins_eng_ws,
  iteminfo = toys
)
# Flag rows whose value is exactly "produces"; a missing value counts as FALSE.
toys_data <- mutate(toys_data, produces = !is.na(value) & value == "produces")

# Per definition and age: number of rows and the share flagged as produced.
toys_summary <- summarise(
  group_by(toys_data, definition, age),
  total = n(),
  prop_produces = sum(produces) / total
)

# One panel per definition, with a binomial GLM curve weighted by group size.
ggplot(toys_summary, aes(x = age, y = prop_produces)) +
  facet_wrap(~definition) +
  geom_smooth(
    aes(weight = total),
    method = "glm",
    se = FALSE,
    method.args = list(family = "binomial")
  ) +
  labs(x = "Age (months)", y = "Proportion of children producing")
Toy trajectories by sex
# Same summary as above, split by sex; rows with a missing sex are dropped
# first.
toys_with_sex <- filter(toys_data, !is.na(sex))
toys_summary_sex <- summarise(
  group_by(toys_with_sex, definition, age, sex),
  total = n(),
  prop_produces = sum(produces) / total
)
rm(toys_with_sex)

# One panel per definition, one coloured binomial GLM curve per sex.
ggplot(toys_summary_sex, aes(x = age, y = prop_produces, colour = sex)) +
  facet_wrap(~definition) +
  geom_smooth(
    aes(weight = total),
    method = "glm",
    se = FALSE,
    method.args = list(family = "binomial")
  ) +
  scale_colour_ptol(name = "") +
  labs(x = "Age (months)", y = "Proportion of children producing") +
  theme(legend.position = "top")
# fit_aoa() with its default arguments, keeping the definition and aoa columns.
toys_data %>%
  fit_aoa() %>%
  select(definition, aoa)

# fit_aoa() with an explicit method and proportion.
toys_data %>%
  fit_aoa(method = "glmrob", proportion = 0.8) %>%
  select(definition, aoa)

# Cross-linguistic items, then the data for one uni_lemma.
get_crossling_items()
get_crossling_data(uni_lemmas = "dog")
library(childesr)
# Open the help index for the childesr package.
help(package = "childesr")
# List the names exported by the attached childesr package.
ls("package:childesr")
##  [1] "connect_to_childes"     "get_collections"
##  [3] "get_contexts"           "get_corpora"
##  [5] "get_database_version"   "get_participants"
##  [7] "get_speaker_statistics" "get_tokens"
##  [9] "get_transcripts"        "get_types"
## [11] "get_utterances"
# Collections and corpora, with no filters.
get_collections()
get_corpora()
# Transcripts filtered by collection, then by a vector of corpus names.
get_transcripts(collection = "Eng-NA")
get_transcripts(corpus = c("Brown", "Clark"))
# Participants filtered by corpus, then by collection plus age.
get_participants(corpus = "Clark")
# NOTE(review): age = c(24, 36) is presumably a range in months -- confirm
# against the childesr documentation.
get_participants(collection = "Eng-NA", age = c(24, 36))
# Utterances from one role, then from every role except that one.
get_utterances(corpus = "Clark", role = "target_child")
get_utterances(corpus = "Clark", role_exclude = "target_child")
# Types matching "dog" in one corpus, then for one role across a collection.
get_types(corpus = "Clark", type = "dog")
get_types(collection = "Eng-NA", role = "target_child", type = "dog")
# Tokens matching "dog" for the target child in the Clark corpus.
get_tokens(corpus = "Clark", role = "target_child", token = "dog")
# NOTE(review): the meaning of replace = FALSE is not visible here -- confirm
# against the get_tokens() documentation.
get_tokens(corpus = "Clark", role = "target_child", token = "dog", replace = FALSE)
# Several token patterns can be passed as a vector.
get_tokens(corpus = "Clark", role = "target_child", token = c("dog", "cat"))
# NOTE(review): "%" looks like a wildcard suffix -- confirm.
get_tokens(corpus = "Clark", role = "target_child", token = "dog%")
# NOTE(review): token = "*" presumably matches any token, leaving stem /
# part_of_speech to do the selecting -- confirm.
get_tokens(corpus = "Clark", role = "target_child", token = "*", stem = "run")
get_tokens(corpus = "Clark", role = "target_child", token = "*", part_of_speech = "v")
# Speaker statistics for the target children of the Brown corpus, then print.
# (Assignment and printing must be separate statements to parse.)
brown_stats <- get_speaker_statistics(corpus = "Brown", role = "target_child")
brown_stats

# mlu_w against the child's age, with one colour and one linear fit per child.
ggplot(brown_stats,
       aes(x = target_child_age, y = mlu_w, colour = target_child_name)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_colour_ptol(name = "")
Exercises
For each corpus, compute the number of transcripts in it. Make a histogram of these counts.
For each corpus, compute the mean length of its transcripts in number of tokens spoken by everyone other than the target child. Plot these means against the number of transcripts computed in exercise 1.
Retrieve and plot the number of times each child in the Brown corpus said each inflection of the verb “go” over age.
Transcript stats
# All transcripts, with no filter.
# (The two assignments are separate statements; R cannot parse them on one
# line without a separator.)
transcripts <- get_transcripts()

# Number of transcript rows per corpus.
corpus_transcripts <- transcripts %>%
  group_by(corpus_id, corpus_name, language) %>%
  summarise(num_transcripts = n())

# Distribution of the per-corpus transcript counts.
ggplot(corpus_transcripts, aes(x = num_transcripts)) +
  geom_histogram() +
  labs(x = "Number of transcripts in corpus", y = "Count of corpora")
Transcript stats
# Speaker statistics for every role except the target child.
# (The two assignments are separate statements; R cannot parse them on one
# line without a separator.)
speaker_stats <- get_speaker_statistics(role_exclude = "target_child")

# Mean num_tokens per corpus, with the per-corpus transcript counts attached.
corpus_tokens <- speaker_stats %>%
  group_by(corpus_id) %>%
  summarise(mean_tokens = mean(num_tokens)) %>%
  # Name the join key explicitly; corpus_id is the only column left on the
  # left-hand side besides the new summary column.
  left_join(corpus_transcripts, by = "corpus_id")

ggplot(corpus_tokens, aes(x = num_transcripts, y = mean_tokens)) +
  geom_point() +
  labs(x = "Number of transcripts in corpus", y = "Mean transcript length")
Brown “go” frequencies
# Target-child tokens in the Brown corpus whose stem is "go".
go_tokens <- get_tokens(
  corpus = "Brown",
  role = "target_child",
  stem = "go",
  token = "*",
  replace = FALSE
)

# Round each age down to a whole number, then count tokens per child, age and
# gloss. floor() works element-wise, so no grouping is needed before it.
go_summary <- go_tokens %>%
  mutate(age = floor(target_child_age)) %>%
  group_by(target_child_name, age, gloss) %>%
  summarise(num_tokens = n())

# One panel per child with free axes, one coloured line per gloss.
ggplot(go_summary, aes(x = age, y = num_tokens, colour = gloss)) +
  facet_wrap(~target_child_name, scales = "free") +
  geom_line() +
  scale_colour_ptol(name = "") +
  labs(x = "Age (months)", y = "Number of tokens spoken by child") +
  theme(legend.position = "top")
Brown “go” frequencies
Use data from Wordbank and/or childes-db to explore some question about language learning. Here are a few ideas:
Wordbank
– Explore the relationship between vocabulary size and grammar ability (the items of type complexity).
– Look at the composition of vocabulary – what proportion of words that children know are which lexical category – and how it changes over age.
childes-db
– Characterize the developmental trajectory of children’s lexical diversity (e.g. MTLD) and how it differs by gender.
– Estimate the frequencies of color terms (or some other interesting set of words) in speech to children over age.
Both
– For some set of words, estimate their age of acquisition from Wordbank and frequency in child-directed speech from childes-db and examine the relationship between them.
– Determine which words are earliest-learned according to CDI data and according to corpus data and compare the two.
Wordbank
– wordbank.stanford.edu
– github.com/langcog/wordbankr
– langcog.github.io/wordbankr
– mb-cdi.stanford.edu
Citation: Frank, M. C., Braginsky, M., Yurovsky, D., & Marchman, V. A. (2017). Wordbank: An open repository for developmental vocabulary data. Journal of Child Language, 44(3), 677-694.
childes-db
– childes-db.stanford.edu/
– github.com/langcog/childesr
– childes.talkbank.org
Citation: Sanchez, A., Meylan, S. C., Braginsky, M., MacDonald, K. E., Yurovsky, D., & Frank, M. C. (2019). childes-db: A flexible and reproducible interface to the Child Language Data Exchange System. Behavior Research Methods, 1-14.
This presentation
– github.com/mikabr/acq-tools
– mikabr.github.io/acq-tools
Contact: mikabr@mit.edu, mcfrank@stanford.edu